knitr::opts_chunk$set( collapse = TRUE, fig.width = 8, fig.height = 6, fig.align = "center", comment = "#>" ) options(rmarkdown.html_vignette.check_title = FALSE)
In this vignette, we will walk you through how to curate your microbiome data extracted from pipelines such as QIIME 2 into a format suitable for MrIML. The main problem with using the raw data as the ASV names a long character vectors of nucleotide data which will make interpretation impossible. The data set is just 10 ASVs from the ostrich case study in Fountain-Jones et al (2024) to keep things simple. We will check out the data and load the required packages.
library(mrIML) library(tidyverse) #raw ASV data load("asv_data.RData") str(asv_data) #corresponding taxa table load("asv_taxa_table.RData") str(asv_taxa_table)
The first step is to prepare the taxa table by removing columns and making sure columns aren't identical. This will help when we start to create meaningful names for the ASVs.
#---------------------------------------------------- # Prepare ASV table #---------------------------------------------------- #ASV column not needed asv_taxa_table$ASV <- NULL asv_taxa_table$Kingdom<- NULL asv_taxa_table$Class<- NULL asv_taxa_table$Phylum<- NULL #create_names create_name <- function(row) { non_na_values <- na.omit(row) unique_values <- unique(non_na_values) # Remove duplicates if (length(unique_values) > 3) { unique_values <- unique_values[1:2] } # Extract genus and species names if available family <- ifelse("Family" %in% names(row), row["Family"], NA) genus <- ifelse("Genus" %in% names(row), row["Genus"], NA) species <- ifelse("Species" %in% names(row), row["Species"], NA) # Combine genus, species, and unique values into a name name_parts <- c(family,genus, species, unique_values) name_parts <- name_parts[!is.na(name_parts)] # Remove NA values return(paste(name_parts, collapse = "_")) } # Create the 'Name' column asv_taxa_table$Name <- apply(asv_taxa_table, 1, create_name)
The next functions take the ASV table data and make unique names for the new column in the taxa table.
#make names unique make_names_unique <- function(names_vector, remove_duplicates = character(0)) { # Remove specified strings (e.g., "Clostridiales") from the names vector names_vector <- names_vector[!names_vector %in% remove_duplicates] # Create a named vector to keep track of counts for each name name_counts <- numeric(length(names_vector)) names(name_counts) <- names_vector # Initialize a result vector result_names <- character(length(names_vector)) for (i in seq_along(names_vector)) { name <- names_vector[i] # Check if the name exists in name_counts if (!is.na(name_counts[name])) { # Check if the name has occurred before if (name_counts[name] > 1) { # Append a number to make the name unique result_names[i] <- paste0(name, "_", name_counts[name]) } else { result_names[i] <- name } # Update the count for this name name_counts[name] <- name_counts[name] + 1 } else { # If name doesn't exist in name_counts, treat it as the first occurrence result_names[i] <- name name_counts[name] <- 1 } } return(result_names) } asv_taxa_table$Name_f <- make_names_unique(asv_taxa_table$Name) replace_empty_names <- function(names_vector) { # Find indices where names are empty empty_indices <- names_vector == "" # Count occurrences of empty names empty_counts <- cumsum(empty_indices) # Replace empty names with "Taxon" followed by count names_vector[empty_indices] <- paste0("Taxon ", empty_counts[empty_indices]) return(names_vector) } asv_taxa_table$Name_comp <-replace_empty_names(asv_taxa_table$Name_f) make_names_unique3 <- function(names_vector) { # Initialize a counter to keep track of occurrences of each name name_counts <- list() # Initialize a result vector to store the modified names result_names <- character(length(names_vector)) for (i in seq_along(names_vector)) { name <- names_vector[i] # Check if the name is empty or has occurred before if (name == "" || is.null(name_counts[[name]]) || name_counts[[name]] > 0) { # Increment the count for this name count <- ifelse(is.null(name_counts[[name]]), 1, name_counts[[name]] + 1) name_counts[[name]] <- count # Append a number to make the name unique result_names[i] <- paste0(name, ".", count) } else { result_names[i] <- name # First occurrence of this name, initialize count to 1 name_counts[[name]] <- 1 } } return(result_names) } asv_taxa_table$Name_comp1 <- make_names_unique3(asv_taxa_table$Name_comp)
Now we can start fomatting our Y (response data) ready for MrIML. This includes filtering rare and common taxa.
#make sure names match final_ASV_table <- asv_data final_ASV_table$ASV <- asv_taxa_table$Name_comp1 #add row names final_ASV_table_r <- final_ASV_table %>% column_to_rownames(var='ASV') #get into right format for MrIML final_ASV_table_df <-as.data.frame(t(final_ASV_table_r )) #make presence/absence pa_ASV_table <- final_ASV_table_df %>% mutate_all(~ ifelse(. > 0, 1, .)) #remove rare and common ASVs Y <- filterRareCommon(pa_ASV_table , lower=0.1, higher=0.9) %>% dplyr::select(sort(names(.))) #0.2 colnames(Y) <- sub(".*_(.*_.*)$", "\\1", colnames(Y)) ## Find duplicated column names duplicated_cols <- duplicated(colnames(Y)) colnames(Y) <- make.names(colnames(Y), unique = TRUE) #more tidying Y <- Y %>% rename_all(~make.names(str_remove_all(., "`"))) # Shorten the part to the left of the underscore to 4 characters abbreviate_names <- function(names_vector) { abbreviated_names <- sapply(strsplit(names_vector, "_"), function(parts) { first_part <- substr(parts[1], 1, 4) second_part <- parts[2] return(paste(first_part, second_part, sep = "_")) }) return(abbreviated_names) } # Use the function to shorten the names( modified_col_names <- abbreviate_names(colnames(Y)) colnames(Y) <- modified_col_names glimpse(Y)
Now we are good to go and the MrIML 2.0 grahical network models can be generated!
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.